1 DARWIN Univariate

1.0.1 Loading the libraries

library("FRESA.CAD")
library(psych)
library(whitening)
library("vioplot")

library(readxl)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 The Data

DARWIN <- read.csv("~/GitHub/FCA/Data/DARWIN/DARWIN.csv")
rownames(DARWIN) <- DARWIN$ID
DARWIN$ID <- NULL
DARWIN$class <- 1*(DARWIN$class=="P")
print(table(DARWIN$class))
#> 
#>  0  1 
#> 85 89

DARWIN[,1:ncol(DARWIN)] <- sapply(DARWIN,as.numeric)

signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(DARWIN) %in% c("class"));
DARWIN[,whof] <- signedlog(DARWIN[,whof])

1.1.0.1 Standarize the names for the reporting

dataframe <- DARWIN
outcome <- "class"

trainFraction <- 0.5
rhoThreshold <- 0.6
TopVariables <- 5
aucTHR <- 0.55

set.seed(5)
trainSample <- sample(nrow(dataframe),nrow(dataframe)*trainFraction)

trainDataFrame <- dataframe[trainSample,]
testDataFrame <- dataframe[-trainSample,]

1.1.1 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
174 450
pander::pander(table(dataframe[,outcome]))
0 1
85 89
pander::pander(table(trainDataFrame[,outcome]))
0 1
45 42
pander::pander(table(testDataFrame[,outcome]))
0 1
40 47

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

1.2 Univariate


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","IDI","ROCAUC","cStatCorr")
univar <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
              rankingTest = "CStat")

100 : mean_jerk_in_air6 200 : disp_index12 300 : mean_speed_in_air17 400 : gmrt_on_paper23



#univar$orderframe[1:5,univariate_columns]
univarTest <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
               testData=testDataFrame,
              rankingTest = "CStat")

100 : mean_jerk_in_air6 200 : disp_index12 300 : mean_speed_in_air17 400 : gmrt_on_paper23


univar$orderframe$BACC <- (univar$orderframe$Sensitivity + univar$orderframe$Specificity)/2.0
univarTest$orderframe$BACC <- (univarTest$orderframe$Sensitivity + univarTest$orderframe$Specificity)/2.0

#pROC::roc(trainDataFrame$class,trainDataFrame[,univar$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.3 Decorrelation with UPSTM Blind

DEdataframe <- IDeA(trainDataFrame,thr=rhoThreshold)
predTestDe <- predictDecorrelate(DEdataframe,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframe);
pander::pander(head(ltvar))
  • La_air_time1:

    air_time1 gmrt_in_air1
    1 1.2
  • La_max_y_extension1:

    disp_index1 gmrt_on_paper1 max_y_extension1
    -0.666 -0.325 1
  • La_mean_acc_on_paper1:

    gmrt_on_paper1 mean_acc_on_paper1
    -0.479 1
  • La_mean_gmrt1:

    gmrt_in_air1 gmrt_on_paper1 mean_gmrt1
    -0.668 -0.371 1
  • La_mean_jerk_in_air1:

    mean_acc_in_air1 mean_jerk_in_air1
    -1.12 1
  • La_mean_jerk_on_paper1:

    mean_acc_on_paper1 mean_jerk_on_paper1
    -0.597 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
2.44
pander::pander(c(Latent=length(ltvar)))
Latent
302


varlistDe <-  colnames(DEdataframe)[colnames(DEdataframe) != outcome];
univarDe <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23


univarDeTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              testData=predTestDe,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23


univarDe$orderframe$BACC <- (univarDe$orderframe$Sensitivity + univarDe$orderframe$Specificity)/2.0
univarDeTest$orderframe$BACC <- (univarDeTest$orderframe$Sensitivity + univarDeTest$orderframe$Specificity)/2.0

#univarDe$orderframe[1:5,univariate_columns]
#univarDeTest$orderframe[1:5,univariate_columns]

#pROC::roc(DEdataframe$class,DEdataframe[,univarDe$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.4 Decorrelation with UPSTM Blind/Spearman

DEdataframeSpear <- IDeA(trainDataFrame,thr=rhoThreshold,method="spearman")
predTestDeSpear <- predictDecorrelate(DEdataframeSpear,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframeSpear);
pander::pander(head(ltvar))
  • La_air_time1:

    air_time1 gmrt_in_air1
    1 1.2
  • La_mean_acc_on_paper1:

    gmrt_on_paper1 mean_acc_on_paper1
    -0.479 1
  • La_mean_gmrt1:

    gmrt_in_air1 gmrt_on_paper1 mean_gmrt1
    -0.626 -0.371 1
  • La_mean_jerk_in_air1:

    mean_acc_in_air1 mean_jerk_in_air1
    -1.12 1
  • La_mean_speed_in_air1:

    gmrt_in_air1 mean_speed_in_air1
    -0.79 1
  • La_mean_speed_on_paper1:

    gmrt_on_paper1 mean_speed_on_paper1
    -1.03 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
2.37
pander::pander(c(Latent=length(ltvar)))
Latent
272


varlistDeSpear <-  colnames(DEdataframeSpear)[colnames(DEdataframeSpear) != outcome];
univarDeSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23


univarDeSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              testData=predTestDeSpear,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23


univarDeSpear$orderframe$BACC <- (univarDeSpear$orderframe$Sensitivity + univarDeSpear$orderframe$Specificity)/2.0
univarDeSpearTest$orderframe$BACC <- (univarDeSpearTest$orderframe$Sensitivity + univarDeSpearTest$orderframe$Specificity)/2.0

1.5 Decorrelation with UPSTM Driven


DriDEdataframe <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold)
predTestDri <- predictDecorrelate(DriDEdataframe,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframe);
pander::pander(head(ltvar))
  • La_gmrt_in_air1:

    air_time1 gmrt_in_air1
    0.371 1
  • La_gmrt_on_paper1:

    gmrt_on_paper1 mean_speed_on_paper1
    1 -0.922
  • La_max_y_extension1:

    disp_index1 max_y_extension1
    -0.666 1
  • La_mean_acc_on_paper1:

    mean_acc_on_paper1 mean_speed_on_paper1
    1 -0.465
  • La_mean_gmrt1:

    gmrt_in_air1 mean_gmrt1 mean_speed_on_paper1
    -0.799 1 -0.258
  • La_mean_jerk_in_air1:

    mean_acc_in_air1 mean_jerk_in_air1
    -1.12 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
2.48
pander::pander(c(Latent=length(ltvar)))
Latent
307


varlistDe <-  colnames(DriDEdataframe)[colnames(DriDEdataframe) != outcome];
univarDeDri <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : La_gmrt_on_paper23


univarDeDriTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              testData=predTestDri,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : La_gmrt_on_paper23


univarDeDri$orderframe$BACC <- (univarDeDri$orderframe$Sensitivity + univarDeDri$orderframe$Specificity)/2.0
univarDeDriTest$orderframe$BACC <- (univarDeDriTest$orderframe$Sensitivity + univarDeDriTest$orderframe$Specificity)/2.0

1.6 Decorrelation with UPSTM Driven and Spearman


DriDEdataframeSpear <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold,method="spearman")
predTestDriSpear <- predictDecorrelate(DriDEdataframeSpear,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframeSpear);
pander::pander(head(ltvar))
  • La_air_time1:

    air_time1 total_time1
    1 -1.48
  • La_disp_index1:

    air_time1 disp_index1 max_y_extension1 total_time1
    0.301 1 -0.414 -0.904
  • La_gmrt_on_paper1:

    gmrt_on_paper1 mean_speed_on_paper1
    1 -0.922
  • La_mean_acc_on_paper1:

    mean_acc_on_paper1 mean_speed_on_paper1
    1 -0.465
  • La_mean_gmrt1:

    gmrt_in_air1 mean_gmrt1 mean_speed_on_paper1
    -0.67 1 -0.258
  • La_mean_jerk_in_air1:

    mean_acc_in_air1 mean_jerk_in_air1
    -1.12 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
2.36
pander::pander(c(Latent=length(ltvar)))
Latent
279


varlistDeSpear <-  colnames(DriDEdataframeSpear)[colnames(DriDEdataframeSpear) != outcome];
univarDeDriSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23


univarDeDriSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              testData=predTestDriSpear,
              rankingTest = "CStat")

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23


univarDeDriSpear$orderframe$BACC <- (univarDeDriSpear$orderframe$Sensitivity + univarDeDriSpear$orderframe$Specificity)/2.0
univarDeDriSpearTest$orderframe$BACC <- (univarDeDriSpearTest$orderframe$Sensitivity + univarDeDriSpearTest$orderframe$Specificity)/2.0

1.6.1 Get continous correlated features

iscontinous <- sapply(apply(trainDataFrame,2,unique),length) > 5 ## Only variables with enough samples

noclassData <- trainDataFrame[,iscontinous]
cmat <- cor(noclassData);
diag(cmat) <- 0;
maxcor <- apply(cmat>rhoThreshold,2,sum);
topcor <- names(maxcor[maxcor > 0]) ## Only correlated features will be PCA
pander::pander(c(Ncor=length(topcor)))
Ncor
421
cmat <- NULL

notcorr <- colnames(trainDataFrame)[!(colnames(trainDataFrame) %in% topcor)]
noclassData <- noclassData[,topcor]
noclassTestData <- testDataFrame[,topcor]

1.7 PCA Analysis


### PCA 

pc <- principal(noclassData,4*TopVariables,rotate="varimax")   #principal components
pander::pander(t(pc$loadings[1:TopVariables,1:TopVariables]))
  air_time1 disp_index1 gmrt_in_air1 gmrt_on_paper1 max_y_extension1
RC1 -0.1152 -0.1668 0.1647 0.1660 -0.0518
RC9 -0.0134 -0.1410 -0.1646 0.0938 -0.1508
RC2 -0.0113 0.1068 0.0681 0.0495 0.0861
RC10 0.4577 0.2534 -0.1914 -0.1393 0.1143
RC8 -0.1008 -0.0385 0.1393 0.1731 0.0730
PCA_Train <- as.data.frame(cbind(predict(pc,noclassData),trainDataFrame[,notcorr]))
colnames(PCA_Train) <- c(colnames(predict(pc,noclassData)),notcorr)

PCA_Predicted <- as.data.frame(cbind(predict(pc,noclassTestData),testDataFrame[,notcorr]))
colnames(PCA_Predicted) <- c(colnames(predict(pc,noclassTestData)),notcorr)

iscontinous <- colnames(PCA_Predicted)[sapply(apply(PCA_Predicted,2,unique),length) > 5] ## Only variables with enough samples
varlistPCA <-  iscontinous;

univarPCA <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              rankingTest = "CStat")

univarPCATest <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              testData=PCA_Predicted,
              rankingTest = "CStat")

univarPCA$orderframe$BACC <- (univarPCA$orderframe$Sensitivity + univarPCA$orderframe$Specificity)/2.0
univarPCATest$orderframe$BACC <- (univarPCATest$orderframe$Sensitivity + univarPCATest$orderframe$Specificity)/2.0

1.8 EFA


uls <- fa(noclassData,4*TopVariables,rotate="varimax")  #unweighted least squares is minres 
pander::pander(t(uls$weights[1:TopVariables,1:TopVariables])) 
  air_time1 disp_index1 gmrt_in_air1 gmrt_on_paper1 max_y_extension1
MR1 -0.1173 -0.168 0.1638 0.1644 -0.0541
MR9 0.0111 0.141 0.1660 -0.0927 0.1481
MR2 -0.0107 0.105 0.0670 0.0476 0.0827
MR4 0.4646 0.260 -0.2040 -0.1575 0.1128
MR6 -0.1456 -0.105 -0.0519 0.0449 -0.1190
EFA_Train <- as.data.frame(cbind(predict(uls,noclassData),trainDataFrame[,notcorr]))
colnames(EFA_Train) <- c(colnames(predict(uls,noclassData)),notcorr)
EFA_Predicted <- as.data.frame(cbind(predict(uls,noclassTestData),testDataFrame[,notcorr]))
colnames(EFA_Predicted) <- c(colnames(predict(uls,noclassTestData)),notcorr)

iscontinous <- colnames(EFA_Predicted)[sapply(apply(EFA_Predicted,2,unique),length) > 5] ## Only variables with enough 
varlistEFA <-  iscontinous
univarEFA <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              rankingTest = "CStat")

univarEFATest <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              testData=EFA_Predicted,
              rankingTest = "CStat")

univarEFA$orderframe$BACC <- (univarEFA$orderframe$Sensitivity + univarEFA$orderframe$Specificity)/2.0
univarEFATest$orderframe$BACC <- (univarEFATest$orderframe$Sensitivity + univarEFATest$orderframe$Specificity)/2.0

1.9 White

WhiteMat = whiteningMatrix(cov(noclassData), method="PCA")
sum(is.na(WhiteMat))

[1] 64413

tokeep <- apply(is.na(WhiteMat),1,sum) == 0
WhiteMat <- WhiteMat[tokeep,]
sum(is.na(WhiteMat))

[1] 0

sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 182

tokeep <- apply(abs(WhiteMat),1,sum) < 1.0e6
WhiteMat <- WhiteMat[tokeep,]
sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 0


pander::pander(c(ncol=ncol(WhiteMat),nrow=nrow(WhiteMat)))
ncol nrow
421 86

pander::pander(WhiteMat[1:TopVariables,1:TopVariables]) 
  air_time1 disp_index1 gmrt_in_air1 gmrt_on_paper1 max_y_extension1
L1 0.000306 -7.76e-05 -1.41e-04 -7.38e-05 -6.95e-05
L2 0.000087 6.40e-05 1.29e-04 8.16e-05 1.11e-05
L3 0.000696 4.66e-05 2.90e-04 2.07e-05 2.37e-04
L4 -0.000321 -1.31e-04 -1.04e-05 3.75e-05 7.30e-05
L5 0.020908 5.96e-03 -7.62e-03 -7.98e-03 2.52e-03
PCAWhite_Train <- as.data.frame(cbind(tcrossprod(as.matrix(noclassData), WhiteMat),trainDataFrame[,notcorr]))
colnames(PCAWhite_Train) <- c(colnames(tcrossprod(as.matrix(noclassData), WhiteMat)),notcorr)

sum(is.na(PCAWhite_Train))

[1] 0




PCAWhitePredicted <- as.data.frame(cbind(tcrossprod(as.matrix(noclassTestData), WhiteMat),testDataFrame[,notcorr]))
colnames(PCAWhitePredicted) <- c(colnames(tcrossprod(as.matrix(noclassTestData), WhiteMat)),notcorr)

sum(is.na(PCAWhitePredicted))

[1] 0


iscontinous <- colnames(PCAWhitePredicted)[sapply(apply(PCAWhitePredicted,2,unique),length) > 5] ## Only variables with enough 
varlistWhite <-  iscontinous

univarWhite <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              rankingTest = "CStat")

100 : num_of_pendown14



univarWhiteTest <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              testData=PCAWhitePredicted,
              rankingTest = "CStat")

100 : num_of_pendown14


univarWhite$orderframe$BACC <- (univarWhite$orderframe$Sensitivity + univarWhite$orderframe$Specificity)/2.0
univarWhiteTest$orderframe$BACC <- (univarWhiteTest$orderframe$Sensitivity + univarWhiteTest$orderframe$Specificity)/2.0

1.10 Correlation Matrices

1.10.1 RAW

par(cex=1.0,cex.main=0.8)
breaks <- c(0:5)/5.0;

cormat <- cor(testDataFrame,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
pander::pander(max(abs(cormat)))

0.999

pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.327 0.397 0.475 0.585 0.917
pander::pander(c(Raw_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
Raw_fraction
0.00925

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Raw Correlation",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature"
                  )


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Raw Correlation",xlab="Spearman Correlation")

rawDen <- density(cormat,from=-1,to=1)
par(op)

1.10.2 UPSTM Blind

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.6
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.216 0.265 0.312 0.379 0.54
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0

## Test Correlation
cormat <- cor(predTestDe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.912
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.224 0.273 0.322 0.393 0.589
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.000875

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM",xlab="Spearman Correlation")

DeDen <- density(cormat,from=-1,to=1)


par(op)

1.10.3 UPSTM Blind/Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.999
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.218 0.27 0.322 0.398 0.975
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00213

## Test Correlation
cormat <- cor(predTestDeSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.933
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.229 0.28 0.329 0.404 0.625
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00132

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA:Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM",xlab="Spearman Correlation")

DeSpearDen <- density(cormat,from=-1,to=1)

par(op)

1.10.4 UPSTM Driven

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.599
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.214 0.263 0.31 0.377 0.532
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0

## Test Correlation
cormat <- cor(DriDEdataframe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.883
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.22 0.268 0.316 0.383 0.56
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.000541

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after Driven-IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")

#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after Driven-UPSTM",xlab="Spearman Correlation")

DeDrivDen <- density(cormat,from=-1,to=1)
par(op)

1.10.5 UPSTM Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.999
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.218 0.271 0.324 0.404 0.974
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.0022

## Test Correlation

cormat <- cor(predTestDriSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.961
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.229 0.28 0.331 0.404 0.623
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.00121

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation: Driven/Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM with Spearman",xlab="Spearman Correlation")

DeDrivSpearDen <- density(cormat,from=-1,to=1)
par(op)

1.10.6 PCA

par(cex=1.0,cex.main=0.8)



## Train Correlation

cormat <- cor(PCA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.876
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.33 0.429 0.579 0.729 0.845
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.0216

## Test Correlation
cormat <- cor(PCA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.907
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.404 0.571 0.711 0.82 0.904
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.0464

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCA",xlab="Spearman Correlation")

PCADen <- density(cormat,from=-1,to=1)

par(op)

1.10.7 EFA

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(EFA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.878
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.33 0.439 0.577 0.735 0.849
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.0216

## Test Correlation
cormat <- cor(EFA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.909
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.404 0.58 0.707 0.82 0.907
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.0464

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after EFA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after EFA",xlab="Spearman Correlation")

EFADen <- density(cormat,from=-1,to=1)
par(op)

1.10.8 PCA Whitening



## Train Correlation

cormat <- cor(PCAWhite_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.669
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.138 0.182 0.229 0.282 0.417
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
0.000297

## Test Correlation
cormat <- cor(PCAWhitePredicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.858
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.223 0.265 0.301 0.351 0.511
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
0.000297

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCAWhite",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCAWhite",xlab="Spearman Correlation")

WhiteDen <- density(cormat,from=-1,to=1)
par(op)

1.11 The Density Plot

par(cex=0.7)
colors=c("red","blue","green","darkblue","darkgreen","purple","orange","darkred");

plot(rawDen,
     xlim=c(-1,1),
     ylim=c(0.001,7.0),
     col=colors[1],
     lty=1,
     lwd=4,
     log="y",
     main="Test: Correlation Distribution",xlab="Spearman Correlation")

lines(DeDen,col=colors[2],lty=2,lwd=4)
lines(DeSpearDen,col=colors[3],lty=3,lwd=4)
lines(DeDrivDen,col=colors[4],lty=4,lwd=2)
lines(DeDrivSpearDen,col=colors[5],lty=5,lwd=2)

lines(PCADen,col=colors[6],lty=6,lwd=1)
lines(EFADen,col=colors[7],lty=7,lwd=1)
lines(WhiteDen,col=colors[8],lty=8,lwd=1)

names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
#colors=c("red","blue","green","blue","green","purple","purple","gray");
lines=c(1,2,3,4,5,6,7,8)
lwds=c(4,4,4,2,2,1,1,1)

legend("topleft",names,col=colors,lty=lines,lwd=lwds,cex=0.50)

par(op)

1.11.1 Differences between train and test ROC AUC

par(op)
par(mfrow=c(1,1),cex=0.7)

AUCResults <- list();
diffAUC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
thenames <- thenames[rawAUC >= aucTHR]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
rawAUCTest <- univarTest$orderframe[thenames,"ROCAUC"]
AUCResults$RAW <- rawAUCTest
diffAUC$RAW <-  rawAUCTest-rawAUC
plot(rawAUC,rawAUCTest-rawAUC,
     xlab="TRAIN ROC AUC",
     ylab="Test:AUC-Train:AUC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="ROC AUC Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAP >= aucTHR]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAP <- IDeAP
AUCResults$IDeAP_T <- IDeAPTest
diffAUC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAS >= aucTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAS <- IDeAS
AUCResults$IDeAS_T <- IDeASTest
diffAUC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAP >= aucTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAP <- DIDeAP
AUCResults$DIDeAP_T <- DIDeAPTest
diffAUC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAS >= aucTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAS <- DIDeAS
AUCResults$DIDeAS_T <- DIDeASTest
diffAUC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[PCA >= aucTHR]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
PCATest <- univarPCATest$orderframe[thenames,"ROCAUC"]
AUCResults$PCA <- PCA
AUCResults$PCA_T <- PCATest
diffAUC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[EFA >= aucTHR]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]

EFATest <- univarEFATest$orderframe[thenames,"ROCAUC"]
AUCResults$EFA <- EFA
AUCResults$EFA_T <- EFATest
diffAUC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
thenames <- thenames[WPCA >= aucTHR]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"ROCAUC"]
AUCResults$WPCA <- WPCA
AUCResults$WPCA_T <- WPCATest
diffAUC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.2 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffAUC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired ROC AUC Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffAUC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffAUC),lapply(diffAUC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.3 Distribution of ROC AUC in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(AUCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(AUCResults,
        ylim=c(0.3,1.0),
        ylab="ROC AUC",
        main="ROC AUC of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(AUCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(AUCResults),lapply(AUCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.4 Differences between train and test Balanced Accuracy

par(op)
par(mfrow=c(1,1),cex=0.7)
BACCTHR <- aucTHR
BACCResults <- list();
diffBACC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawBACC <- univar$orderframe[thenames,"BACC"]
thenames <- thenames[rawBACC >= BACCTHR]
rawBACC <- univar$orderframe[thenames,"BACC"]
rawBACCTest <- univarTest$orderframe[thenames,"BACC"]
BACCResults$RAW <- rawBACCTest
diffBACC$RAW <-  rawBACCTest-rawBACC
plot(rawBACC,rawBACCTest-rawBACC,
     xlab="TRAIN Balanced Acc",
     ylab="Test:BACC-Train:BACC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="Balanced Acc Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAP >= BACCTHR]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"BACC"]
BACCResults$IDeAP <- IDeAP
BACCResults$IDeAP_T <- IDeAPTest
diffBACC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAS >= BACCTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"BACC"]
BACCResults$IDeAS <- IDeAS
BACCResults$IDeAS_T <- IDeASTest
diffBACC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAP >= BACCTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAP <- DIDeAP
BACCResults$DIDeAP_T <- DIDeAPTest
diffBACC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAS >= BACCTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAS <- DIDeAS
BACCResults$DIDeAS_T <- DIDeASTest
diffBACC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"BACC"]
thenames <- thenames[PCA >= BACCTHR]
PCA <- univarPCA$orderframe[thenames,"BACC"]
PCATest <- univarPCATest$orderframe[thenames,"BACC"]
BACCResults$PCA <- PCA
BACCResults$PCA_T <- PCATest
diffBACC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"BACC"]
thenames <- thenames[EFA >= BACCTHR]
EFA <- univarEFA$orderframe[thenames,"BACC"]

EFATest <- univarEFATest$orderframe[thenames,"BACC"]
BACCResults$EFA <- EFA
BACCResults$EFA_T <- EFATest
diffBACC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
thenames <- thenames[WPCA >= BACCTHR]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"BACC"]
BACCResults$WPCA <- WPCA
BACCResults$WPCA_T <- WPCATest
diffBACC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.5 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffBACC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired Balanced Acc Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffBACC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffBACC),lapply(diffBACC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.6 Distribution of Balanced Acc in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(BACCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(BACCResults,
        ylim=c(0.3,1.0),
        ylab="Balanced Acc",
        main="Balanced Acc of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(BACCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(BACCResults),lapply(BACCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.7 Differences between train and test IDI

par(op)
par(mfrow=c(1,1),cex=0.7)

testIDI <- list();
diffIDI <- list();
rawIDI <- univar$orderframe$IDI
rawIDITest <- univarTest$orderframe$IDI
testIDI$RAW <- rawIDITest
diffIDI$RAW <-  rawIDITest-rawIDI
plot(rawIDI,rawIDITest-rawIDI,
     xlab="TRAIN Test IDI",
     ylab="Test:IDI-Train:IDI",
     xlim=c(0,0.5),
     ylim=c(-0.2,0.2),
     pch=1,
     col=colors[1],
     main="Predict IDI Difference Between Test and Train")

IDeAP <- univarDe$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAPTest <-univarDeTest$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAP <- IDeAP
testIDI$IDeAP_T <- IDeAPTest
diffIDI$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

IDeAS <- univarDeSpear$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
IDeASTest <- univarDeSpearTest$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAS <- IDeAS
testIDI$IDeAS_T <- IDeASTest
diffIDI$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

DIDeAP <- univarDeDri$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
DIDeAPTest <- univarDeDriTest$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAP <- DIDeAP
testIDI$DIDeAP_T <- DIDeAPTest
diffIDI$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

DIDeAS <- univarDeDriSpear$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
DIDeASTest <- univarDeDriSpearTest$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAS <- DIDeAS
testIDI$DIDeAS_T <- DIDeASTest
diffIDI$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

PCA <- univarPCA$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCATest <- univarPCATest$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
testIDI$PCA <- PCA
testIDI$PCA_T <- PCATest
diffIDI$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

EFA <- univarEFA$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFATest <- univarEFATest$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
testIDI$EFA <- EFA
testIDI$EFA_T <- EFATest
diffIDI$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

WPCA <- univarWhite$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCATest <- univarWhiteTest$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
testIDI$WPCA <- WPCA
testIDI$WPCA_T <- WPCATest
diffIDI$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.8 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffIDI,
        ylim=c(-0.2,0.2),
        ylab="Test-Train",
        main="Test-Train Paired Predict IDI Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffIDI),lapply(diffIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.9 Distribution of Predict IDI in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(testIDI)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(testIDI,
        ylim=c(0.0,0.5),
        ylab="Predict IDI",
        main="Predict IDI of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
stripchart(testIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(testIDI),lapply(testIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.10 The tables


pander::pander(univarTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
total_time15 38.0 0.757 37.1 0.440 0.7999 0.387 0.824 0.869
total_time23 37.3 0.572 36.7 0.331 0.0131 0.370 0.888 0.853
air_time15 37.7 0.904 36.6 0.650 0.5892 0.348 0.807 0.851
total_time8 36.8 0.745 36.0 0.525 0.7893 0.237 0.733 0.826
air_time23 36.7 0.698 35.9 0.526 0.4950 0.341 0.881 0.825
pander::pander(univarDeTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
air_time15 37.7 0.904 36.6 0.650 0.5892 0.348 0.807 0.851
air_time23 36.7 0.698 35.9 0.526 0.4950 0.341 0.881 0.825
air_time17 37.9 0.766 37.1 0.868 0.0105 0.257 0.807 0.812
air_time22 36.6 0.821 35.9 0.450 0.8395 0.259 0.749 0.793
air_time2 36.4 1.273 35.2 1.025 0.2103 0.209 0.769 0.786
pander::pander(univarDeSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
air_time15 37.7 0.904 36.6 0.6498 0.58917 0.348 0.807 0.851
air_time23 36.7 0.698 35.9 0.5257 0.49496 0.341 0.881 0.825
pressure_mean4 35.1 0.224 35.2 0.0344 0.00651 0.170 0.718 0.802
air_time22 36.6 0.821 35.9 0.4500 0.83946 0.259 0.749 0.793
mean_acc_in_air17 27.6 0.601 28.1 0.5218 0.38808 0.193 0.765 0.779
pander::pander(univarDeDriTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
total_time15 38.0 0.757 37.1 0.440 0.79993 0.387 0.824 0.869
total_time23 37.3 0.572 36.7 0.331 0.01306 0.370 0.888 0.853
total_time8 36.8 0.745 36.0 0.525 0.78930 0.237 0.733 0.826
total_time17 38.5 0.578 37.9 0.668 0.00557 0.259 0.840 0.817
total_time3 37.1 0.612 36.3 0.616 0.67218 0.181 0.712 0.804
pander::pander(univarDeDriSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
total_time23 37.3 0.572 36.7 0.3309 0.01306 0.3697 0.888 0.853
total_time8 36.8 0.745 36.0 0.5251 0.78930 0.2372 0.733 0.826
total_time17 38.5 0.578 37.9 0.6678 0.00557 0.2590 0.840 0.817
pressure_mean4 35.1 0.224 35.2 0.0344 0.00651 0.1696 0.718 0.802
total_time1 36.9 0.677 36.4 0.3647 0.84669 0.0364 0.532 0.795
pander::pander(univarPCATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
RC15 11.55 14.92 -10.78 10.17 9.80e-01 0.507 0.926 0.908
RC7 5.60 4.68 -5.23 18.52 6.76e-06 0.307 0.734 0.883
RC10 18.73 25.77 -17.48 19.09 8.81e-01 0.423 0.887 0.872
RC4 -8.55 13.89 7.98 9.41 2.23e-01 0.328 0.789 0.869
RC9 -24.06 37.61 22.46 25.27 6.85e-01 0.410 0.882 0.851
pander::pander(univarEFATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
MR17 11.44 14.8 -10.68 10.04 9.88e-01 0.507 0.925 0.908
MR7 5.55 4.6 -5.18 18.45 6.23e-06 0.310 0.734 0.887
MR4 19.12 26.0 -17.85 19.30 8.46e-01 0.428 0.886 0.875
MR14 -8.09 13.4 7.55 9.16 2.04e-01 0.317 0.787 0.865
MR9 23.79 37.2 -22.20 24.96 6.84e-01 0.410 0.882 0.852
pander::pander(univarWhiteTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
L5 21.65 0.841 20.31 0.648 7.86e-01 0.501728 0.890 0.902
L2 10.02 0.727 9.81 1.200 1.11e-10 -0.001042 0.696 0.807
L3 -3.42 0.665 -3.06 1.217 4.18e-11 0.151744 0.698 0.786
L4 13.36 1.280 13.40 0.653 1.96e-07 -0.000595 0.265 0.697
L1 -4.04 1.244 -4.40 0.670 1.02e-09 -0.001982 0.477 0.688

topUni <- univar$orderframe$Name[1:TopVariables]
topDe <- univarDe$orderframe$Name[1:TopVariables]
topDeSpear <- univarDeSpear$orderframe$Name[1:TopVariables]
topDeDri <- univarDeDri$orderframe$Name[1:TopVariables]
topDeDriSpear <- univarDeDriSpear$orderframe$Name[1:TopVariables]
topPCA <- univarPCA$orderframe$Name[1:TopVariables]
topEFA <- univarEFA$orderframe$Name[1:TopVariables]
topPCAWhite <- univarWhite$orderframe$Name[1:TopVariables]

1.11.11 Model of top variables

par(mfrow=c(1,2),cex=0.6)

lmRAW <- glm(paste(outcome,"~."),
             trainDataFrame[,c(outcome,topUni)],
             family="binomial")
prRaw <- predictionStats_binary(cbind(testDataFrame[,outcome],predict(lmRAW,testDataFrame)),"Top Raw",cex=0.75)

Top Raw


lmDe <- glm(paste(outcome,"~."),
            DEdataframe[,c(outcome,topDe)],
            family="binomial")
prDe <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDe,predTestDe)),"Top IDeA:P",cex=0.75)

Top IDeA:P


lmDeSpear <- glm(paste(outcome,"~."),
            DEdataframeSpear[,c(outcome,topDeSpear)],
            family="binomial")
prSpear <- predictionStats_binary(cbind(predTestDeSpear[,outcome],predict(lmDeSpear,predTestDeSpear)),"Top IDeA:S",cex=0.75)

Top IDeA:S


lmDri <- glm(paste(outcome,"~."),
            DriDEdataframe[,c(outcome,topDeDri)],
            family="binomial")
prDri <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDri,predTestDri)),"Top DIDeA:P",cex=0.75)

Top DIDeA:P


lmDriSpear <- glm(paste(outcome,"~."),
            DriDEdataframeSpear[,c(outcome,topDeDriSpear)],
            family="binomial")
prDriSpear <- predictionStats_binary(cbind(predTestDriSpear[,outcome],predict(lmDriSpear,predTestDriSpear)),"Top DIDeA:S",cex=0.7)

Top DIDeA:S



lmPCA <- glm(paste(outcome,"~."),
            PCA_Train[,c(outcome,topPCA)],
            family="binomial")
prPCA <- predictionStats_binary(cbind(PCA_Predicted[,outcome],predict(lmPCA,PCA_Predicted)),"Top PCA",cex=0.75)

Top PCA



lmEFA <- glm(paste(outcome,"~."),
            EFA_Train[,c(outcome,topEFA)],
            family="binomial")
prEFA <- predictionStats_binary(cbind(EFA_Predicted[,outcome],predict(lmEFA,EFA_Predicted)),"Top EFA",cex=0.75)

Top EFA



lmPCAW <- glm(paste(outcome,"~."),
            PCAWhite_Train[,c(outcome,topPCAWhite)],
            family="binomial")
prWPCA <- predictionStats_binary(cbind(PCAWhitePredicted[,outcome],predict(lmPCAW,PCAWhitePredicted)),"Top White:PCA",cex=0.75)

Top White:PCA

par(op)

1.11.12 The Performance Tables and Plots


par(cex=0.6)

 aucs <- prRaw$aucs
  aucs <- rbind(aucs,prDe$aucs)
  aucs <- rbind(aucs,prSpear$aucs)
  aucs <- rbind(aucs,prDri$aucs)
  aucs <- rbind(aucs,prDriSpear$aucs)
  aucs <- rbind(aucs,prPCA$aucs)
  aucs <- rbind(aucs,prEFA$aucs)
  aucs <- rbind(aucs,prWPCA$aucs)

  
  rownames(aucs) <- c("RAW",
                        "IDeA:P",
                        "IDeA:S",
                        "DIDeA:P",
                        "DIDeA:S",
                        "PCA",
                        "EFA",
                        "WPCA"
                        )
  
  pander::pander(aucs)
  est lower upper
RAW 0.884 0.806 0.961
IDeA:P 0.913 0.845 0.980
IDeA:S 0.905 0.831 0.979
DIDeA:P 0.898 0.830 0.967
DIDeA:S 0.879 0.801 0.958
PCA 0.852 0.773 0.932
EFA 0.851 0.771 0.931
WPCA 0.867 0.792 0.942
  
  bpAUC <- barPlotCiError(as.matrix(aucs),
                          metricname = "ROC AUC",
                          thesets = "Test AUC",
                          themethod = rownames(aucs),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )


  
 berror <- prRaw$berror
  berror <- rbind(berror,prDe$berror)
  berror <- rbind(berror,prSpear$berror)
  berror <- rbind(berror,prDri$berror)
  berror <- rbind(berror,prDriSpear$berror)
  berror <- rbind(berror,prPCA$berror)
  berror <- rbind(berror,prEFA$berror)
  berror <- rbind(berror,prWPCA$berror)


  rownames(berror) <- rownames(aucs)
  pander::pander(berror)
  50% 2.5% 97.5%
RAW 0.198 0.1247 0.286
IDeA:P 0.151 0.0897 0.224
IDeA:S 0.175 0.1046 0.254
DIDeA:P 0.142 0.0714 0.213
DIDeA:S 0.199 0.1210 0.281
PCA 0.258 0.1705 0.355
EFA 0.255 0.1754 0.349
WPCA 0.208 0.1296 0.291

  bpBER <- barPlotCiError(as.matrix(berror),
                          metricname = "Balanced Error Rate",
                          thesets = "Test BER",
                          themethod = rownames(aucs),
                          main = "Balanced Error Rate",
                          offsets = c(0.5,1),
                          scoreDirection = "<",
                          ho=0.5,
                          args.legend = list(bg = "white",x="topleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )

  par(op)